810 lines (809 with data), 25.0 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import pickle\n",
"import random\n",
"import glob\n",
"import datetime\n",
"import pandas as pd\n",
"import numpy as np\n",
"import cv2\n",
"import pydicom\n",
"from tqdm import tqdm\n",
"from joblib import delayed, Parallel\n",
"import zipfile\n",
"from pydicom.filebase import DicomBytesIO\n",
"import sys\n",
"from PIL import Image\n",
"import cv2\n",
"\n",
"import keras\n",
"from keras.models import model_from_json\n",
"import tensorflow as tf\n",
"from keras.models import Sequential, Model\n",
"from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, GlobalAveragePooling2D\n",
"from keras.applications.inception_v3 import InceptionV3\n",
"\n",
"# importing pyplot and image from matplotlib \n",
"import matplotlib.pyplot as plt \n",
"import matplotlib.image as mpimg \n",
"\n",
"\n",
"from keras.preprocessing import image\n",
"import albumentations as A"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"base_url = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>Diagnosis</th>\n",
" <th>any</th>\n",
" <th>epidural</th>\n",
" <th>intraparenchymal</th>\n",
" <th>intraventricular</th>\n",
" <th>subarachnoid</th>\n",
" <th>subdural</th>\n",
" </tr>\n",
" <tr>\n",
" <th>ImageID</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>ID_000012eaf</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ID_000039fa0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ID_00005679d</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ID_00008ce3c</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ID_0000950d7</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Diagnosis any epidural intraparenchymal intraventricular subarachnoid \\\n",
"ImageID \n",
"ID_000012eaf 0 0 0 0 0 \n",
"ID_000039fa0 0 0 0 0 0 \n",
"ID_00005679d 0 0 0 0 0 \n",
"ID_00008ce3c 0 0 0 0 0 \n",
"ID_0000950d7 0 0 0 0 0 \n",
"\n",
"Diagnosis subdural \n",
"ImageID \n",
"ID_000012eaf 0 \n",
"ID_000039fa0 0 \n",
"ID_00005679d 0 \n",
"ID_00008ce3c 0 \n",
"ID_0000950d7 0 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df = pd.read_csv(f'{base_url}/stage_2_train.csv').drop_duplicates()\n",
"train_df['ImageID'] = train_df['ID'].str.slice(stop=12)\n",
"train_df['Diagnosis'] = train_df['ID'].str.slice(start=13)\n",
"train_labels = train_df.pivot(index=\"ImageID\", columns=\"Diagnosis\", values=\"Label\")\n",
"train_labels.head()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"train_metadata = pd.read_parquet(f'{base_url}/train_metadata.parquet.gzip')\n",
"test_metadata = pd.read_parquet(f'{base_url}/test_metadata.parquet.gzip')\n",
"\n",
"train_metadata[\"Dataset\"] = \"train\"\n",
"test_metadata[\"Dataset\"] = \"test\"\n",
"\n",
"train_metadata = train_metadata.join(train_labels)\n",
"\n",
"metadata = pd.concat([train_metadata, test_metadata], sort=True)\n",
"metadata.sort_values(by=\"ImagePositionPatient_2\", inplace=True, ascending=False)\n",
"metadata.drop(['ID_6431af929'],inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>BitsAllocated</th>\n",
" <th>BitsStored</th>\n",
" <th>Columns</th>\n",
" <th>Dataset</th>\n",
" <th>HighBit</th>\n",
" <th>ImageOrientationPatient_0</th>\n",
" <th>ImageOrientationPatient_1</th>\n",
" <th>ImageOrientationPatient_2</th>\n",
" <th>ImageOrientationPatient_3</th>\n",
" <th>ImageOrientationPatient_4</th>\n",
" <th>...</th>\n",
" <th>StudyID</th>\n",
" <th>StudyInstanceUID</th>\n",
" <th>WindowCenter</th>\n",
" <th>WindowWidth</th>\n",
" <th>any</th>\n",
" <th>epidural</th>\n",
" <th>intraparenchymal</th>\n",
" <th>intraventricular</th>\n",
" <th>subarachnoid</th>\n",
" <th>subdural</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Image</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>ID_24250ffbc</th>\n",
" <td>16</td>\n",
" <td>12</td>\n",
" <td>512</td>\n",
" <td>train</td>\n",
" <td>11</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.920505</td>\n",
" <td>...</td>\n",
" <td></td>\n",
" <td>ID_6222a3935b</td>\n",
" <td>40.0</td>\n",
" <td>80.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ID_6e8c8d650</th>\n",
" <td>16</td>\n",
" <td>12</td>\n",
" <td>512</td>\n",
" <td>train</td>\n",
" <td>11</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.920505</td>\n",
" <td>...</td>\n",
" <td></td>\n",
" <td>ID_6222a3935b</td>\n",
" <td>40.0</td>\n",
" <td>80.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ID_ac042708d</th>\n",
" <td>16</td>\n",
" <td>12</td>\n",
" <td>512</td>\n",
" <td>train</td>\n",
" <td>11</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.920505</td>\n",
" <td>...</td>\n",
" <td></td>\n",
" <td>ID_6222a3935b</td>\n",
" <td>40.0</td>\n",
" <td>80.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ID_d1e2a17a9</th>\n",
" <td>16</td>\n",
" <td>12</td>\n",
" <td>512</td>\n",
" <td>train</td>\n",
" <td>11</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.927184</td>\n",
" <td>...</td>\n",
" <td></td>\n",
" <td>ID_a5fb903898</td>\n",
" <td>40.0</td>\n",
" <td>80.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ID_e1a1b45a5</th>\n",
" <td>16</td>\n",
" <td>12</td>\n",
" <td>512</td>\n",
" <td>train</td>\n",
" <td>11</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.920505</td>\n",
" <td>...</td>\n",
" <td></td>\n",
" <td>ID_6222a3935b</td>\n",
" <td>40.0</td>\n",
" <td>80.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 36 columns</p>\n",
"</div>"
],
"text/plain": [
" BitsAllocated BitsStored Columns Dataset HighBit \\\n",
"Image \n",
"ID_24250ffbc 16 12 512 train 11 \n",
"ID_6e8c8d650 16 12 512 train 11 \n",
"ID_ac042708d 16 12 512 train 11 \n",
"ID_d1e2a17a9 16 12 512 train 11 \n",
"ID_e1a1b45a5 16 12 512 train 11 \n",
"\n",
" ImageOrientationPatient_0 ImageOrientationPatient_1 \\\n",
"Image \n",
"ID_24250ffbc 1.0 0.0 \n",
"ID_6e8c8d650 1.0 0.0 \n",
"ID_ac042708d 1.0 0.0 \n",
"ID_d1e2a17a9 1.0 0.0 \n",
"ID_e1a1b45a5 1.0 0.0 \n",
"\n",
" ImageOrientationPatient_2 ImageOrientationPatient_3 \\\n",
"Image \n",
"ID_24250ffbc 0.0 0.0 \n",
"ID_6e8c8d650 0.0 0.0 \n",
"ID_ac042708d 0.0 0.0 \n",
"ID_d1e2a17a9 0.0 0.0 \n",
"ID_e1a1b45a5 0.0 0.0 \n",
"\n",
" ImageOrientationPatient_4 ... StudyID StudyInstanceUID \\\n",
"Image ... \n",
"ID_24250ffbc 0.920505 ... ID_6222a3935b \n",
"ID_6e8c8d650 0.920505 ... ID_6222a3935b \n",
"ID_ac042708d 0.920505 ... ID_6222a3935b \n",
"ID_d1e2a17a9 0.927184 ... ID_a5fb903898 \n",
"ID_e1a1b45a5 0.920505 ... ID_6222a3935b \n",
"\n",
" WindowCenter WindowWidth any epidural intraparenchymal \\\n",
"Image \n",
"ID_24250ffbc 40.0 80.0 0.0 0.0 0.0 \n",
"ID_6e8c8d650 40.0 80.0 0.0 0.0 0.0 \n",
"ID_ac042708d 40.0 80.0 0.0 0.0 0.0 \n",
"ID_d1e2a17a9 40.0 80.0 0.0 0.0 0.0 \n",
"ID_e1a1b45a5 40.0 80.0 0.0 0.0 0.0 \n",
"\n",
" intraventricular subarachnoid subdural \n",
"Image \n",
"ID_24250ffbc 0.0 0.0 0.0 \n",
"ID_6e8c8d650 0.0 0.0 0.0 \n",
"ID_ac042708d 0.0 0.0 0.0 \n",
"ID_d1e2a17a9 0.0 0.0 0.0 \n",
"ID_e1a1b45a5 0.0 0.0 0.0 \n",
"\n",
"[5 rows x 36 columns]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"metadata.head()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"test_df = metadata[metadata['Dataset'] == 'test'].iloc[:,:-6].drop(['Dataset'], axis= 1)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"train_df = metadata[metadata['Dataset'] == 'train'].drop(['Dataset'], axis= 1)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(752802, 35)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df.shape"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(752802, 6)"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_y = train_df[['any','epidural','intraparenchymal','intraventricular', 'subarachnoid','subdural']]\n",
"train_y.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generator"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"from skimage.io import imread\n",
"def get_input(path):\n",
" \n",
" img = imread( path )\n",
" \n",
" return( img )"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from tensorflow.keras.applications.vgg19 import preprocess_input\n",
"def get_output( path, label_file = None ):\n",
" \n",
" img_id = path.split('/')[-1].split('.')[0]\n",
" labels = label_file.loc[img_id].values\n",
" \n",
" return(labels)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"def preprocess_input( image ):\n",
" image_size = (224,224)\n",
" transform = A.Compose([\n",
" A.Resize(*image_size),\n",
" A.HorizontalFlip(),\n",
" A.OneOf([\n",
" A.ElasticTransform(alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03,p=0.1),\n",
" A.GridDistortion(p=0.2),\n",
" A.OpticalDistortion(distort_limit=2, shift_limit=0.5,p=0.3),\n",
" ], p=0.3),\n",
" A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=10,p=0.1),\n",
" ])\n",
" random.seed(42) \n",
" augmented_image = cv2.resize(image, (224, 224), \n",
" interpolation = cv2.INTER_NEAREST) \n",
" #augmented_image = transform(image=image)['image']\n",
" #image = np.expand_dims(augmented_image, axis=0)\n",
" #image = preprocess_input(image)\n",
" return( augmented_image )"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"def image_generator(files, label_file, batch_size = 64):\n",
" train_url = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/png/train/adjacent-brain-cropped/'\n",
" while True:\n",
" # Select files (paths/indices) for the batch\n",
" for index in range(len(files)):\n",
" batch_paths = files[index*batch_size:(index+1)*batch_size]\n",
" batch_input = []\n",
" batch_output = [] \n",
"\n",
" # Read in each input, perform preprocessing and get labels\n",
" for input_path in batch_paths:\n",
" input = get_input( train_url +input_path + '.png')\n",
" output = get_output(train_url + input_path + '.png',label_file=label_file )\n",
"\n",
" input = preprocess_input(image=input)\n",
" batch_input += [ input ]\n",
" batch_output += [ output ]\n",
" # Return a tuple of (input, output) to feed the network\n",
" batch_x = np.array( batch_input )\n",
" batch_y = np.array( batch_output )\n",
"\n",
" yield( batch_x, batch_y )"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"train_url = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/png/train/adjacent-brain-cropped/'\n"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"train_generator = image_generator(train_df.index, train_y, batch_size = 32)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"l = next(train_generator)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(224, 224, 3)"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"l[0][0].shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 3/3\n",
"5881/5881 [==============================] - 1368s 233ms/step - loss: 0.2791 - accuracy: 0.9298 - true_positives_9: 641.2561 - false_positives_9: 2328.5566 - true_negatives_9: 525110.8125 - false_negatives_9: 36590.9922 - auc_9: 0.6685\n"
]
},
{
"data": {
"text/plain": [
"<keras.callbacks.callbacks.History at 0x7f0d874cfdd8>"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# create the base pre-trained model\n",
"base_model = InceptionV3(weights='imagenet', include_top=False, input_shape=(224,224,3))\n",
"for layer in base_model.layers[:-2]:\n",
" layer.trainable = False\n",
"\n",
"x = base_model.output\n",
"x = GlobalAveragePooling2D()(x)\n",
"#x = Dense(1024, activation='relu')(x)\n",
"#x = Dense(256, activation='relu')(x)\n",
"predictions = Dense(6, activation='softmax')(x)\n",
"\n",
"adam = keras.optimizers.Adam(learning_rate=0.00001,\n",
" beta_1=0.9,\n",
" beta_2=0.999,\n",
" amsgrad=False)\n",
"\n",
"# this is the model we will train\n",
"model = Model(inputs=base_model.input, outputs=predictions)\n",
"model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy',tf.keras.metrics.TruePositives(),tf.keras.metrics.FalsePositives(), tf.keras.metrics.TrueNegatives(),tf.keras.metrics.FalseNegatives(), tf.keras.metrics.AUC()])\n",
"\n",
"BATCH_SIZE = 128\n",
"NUM_TRAIN_IMAGES = train_df.shape[0]\n",
"# Train model on dataset\n",
"model.fit_generator(train_generator, initial_epoch =2,epochs = 3,steps_per_epoch= NUM_TRAIN_IMAGES // BATCH_SIZE, verbose = 1)\n",
"\n",
"\n",
"# In[ ]:\n",
"\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model weights and architecture saved.\n"
]
}
],
"source": [
"\n",
"# Save Weights and architecture\n",
"model.save_weights('model_weights-BC_3.h5')\n",
"\n",
"\n",
"# Save the model architecture\n",
"with open('model_architecture-BC_3.json', 'w') as f:\n",
" f.write(model.to_json())\n",
"\n",
"print(\"Model weights and architecture saved.\")\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<generator object image_generator at 0x7efefb7da678>"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_generator"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Environment (conda_tensorflow2_p36)",
"language": "python",
"name": "conda_tensorflow2_p36"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 4
}